#!/bin/bash
#
# Start `vllm serve` for the AWQ-quantized Qwen2.5-VL-32B-Instruct checkpoint,
# exposed under the served model name "qwenvl" on 0.0.0.0:8000.
#
# Optional environment:
#   TENSOR_PARALLEL_SIZE  value passed to `-tp` (default: 4)

set -euo pipefail
set -x  # trace every command so the exact launch line shows up in the logs

# Timeouts handed to vLLM through its environment.
# NOTE(review): the _S suffix suggests seconds for the first; the second is
# presumably milliseconds (both would then be 10 hours) — confirm against the
# vLLM environment-variable documentation.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000
# NOTE(review): presumably forces CUDA graph usage — confirm this variable is
# still honored by the installed vLLM version.
export VLLM_ENFORCE_CUDA_GRAPH=1

readonly model_path="/home/weights/Qwen/Qwen2.5-VL-32B-Instruct-AWQ"

# 32Ki tokens; the same value is used for the model length limit and for
# --max-seq-len-to-capture, so compute it once.
readonly max_len=$((32 * 1024))

# Tensor-parallel degree; override via the environment without editing the file.
readonly tp_size="${TENSOR_PARALLEL_SIZE:-4}"

# Build the argument list as an array so each flag sits on its own line and
# the JSON values are passed as single, unsplit words.
serve_args=(
  "${model_path}"
  --served_model_name "qwenvl"
  -tp "${tp_size}"
  --max-model-len "${max_len}"
  --trust-remote-code
  --host 0.0.0.0
  --port 8000
  --quantization "awq"
  # Up to 8 images and no videos per prompt.
  --limit-mm-per-prompt '{"image":8,"video":0}'
  --mm_processor_kwargs '{"min_pixels":200704, "max_pixels":1003520}'
  --max-seq-len-to-capture "${max_len}"
  --compilation_config='{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}'
  --async-scheduling
)

vllm serve "${serve_args[@]}"

# vllm serve /home/weights/Qwen/Qwen2.5-VL-32B-Instruct-AWQ --served_model_name "qwenvl" -tp 2 --max-model-len $[32*1024] --trust-remote-code --host 0.0.0.0 --port 8000 --quantization "awq" --limit-mm-per-prompt '{"image":8,"video":0}' --mm_processor_kwargs '{"min_pixels":200704, "max_pixels":1003520}'  --max-seq-len-to-capture $[32*1024] --compilation_config='{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' --async-scheduling

